{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## RCV1-v2 Dataset [source here](http://www.jmlr.org/papers/volume5/lewis04a/lyrl2004_rcv1v2_README.htm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using Theano backend.\n"
     ]
    }
   ],
   "source": [
    "import logging\n",
    "\n",
    "from sklearn.datasets import fetch_rcv1\n",
    "from sklearn.multiclass import OneVsRestClassifier\n",
    "from sklearn.metrics import f1_score, precision_score, recall_score\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn import svm\n",
    "from keras.layers import Input, Dense\n",
    "from keras.models import Model\n",
    "\n",
    "logging.basicConfig()\n",
    "rcv1 = fetch_rcv1()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "MemoryError",
     "evalue": "",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mMemoryError\u001b[0m                               Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-13-c875bed75df7>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m      1\u001b[0m \u001b[0mtraining_samples\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;36m23149\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      2\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m----> 3\u001b[1;33m \u001b[0mX_train\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrcv1\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdata\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;33m:\u001b[0m\u001b[0mtraining_samples\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtodense\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      4\u001b[0m \u001b[1;31m# X_test = rcv1.data[training_samples:].todense()\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m      5\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/base.pyc\u001b[0m in \u001b[0;36mtodense\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m    629\u001b[0m             \u001b[1;33m`\u001b[0m\u001b[0mnumpy\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmatrix\u001b[0m\u001b[1;33m`\u001b[0m \u001b[0mobject\u001b[0m \u001b[0mthat\u001b[0m \u001b[0mshares\u001b[0m \u001b[0mthe\u001b[0m \u001b[0msame\u001b[0m \u001b[0mmemory\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    630\u001b[0m         \"\"\"\n\u001b[1;32m--> 631\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0masmatrix\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    632\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    633\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/compressed.pyc\u001b[0m in \u001b[0;36mtoarray\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m    938\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    939\u001b[0m         \u001b[1;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 940\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtocoo\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcopy\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    941\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    942\u001b[0m     \u001b[1;31m##############################################################\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/coo.pyc\u001b[0m in \u001b[0;36mtoarray\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m    248\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mtoarray\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    249\u001b[0m         \u001b[1;34m\"\"\"See the docstring for `spmatrix.toarray`.\"\"\"\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 250\u001b[1;33m         \u001b[0mB\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_process_toarray_args\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    251\u001b[0m         \u001b[0mfortran\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mf_contiguous\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    252\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mfortran\u001b[0m \u001b[1;32mand\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mB\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mflags\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mc_contiguous\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/scipy/sparse/base.pyc\u001b[0m in \u001b[0;36m_process_toarray_args\u001b[1;34m(self, order, out)\u001b[0m\n\u001b[0;32m    815\u001b[0m             \u001b[1;32mreturn\u001b[0m \u001b[0mout\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    816\u001b[0m         \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 817\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mnp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mzeros\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdtype\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0morder\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0morder\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    818\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    819\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0m__numpy_ufunc__\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mfunc\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mpos\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0minputs\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mMemoryError\u001b[0m: "
     ]
    }
   ],
   "source": [
    "training_samples = 23149\n",
    "\n",
    "X_train = rcv1.data[:training_samples].todense()\n",
    "# X_test = rcv1.data[training_samples:].todense()\n",
    "\n",
    "# X_train_rshp = X_train.reshape((len(X_train), np.prod(X_train.shape[1:])))\n",
    "# X_test_rshp = X_test.reshape((len(X_test), np.prod(X_test.shape[1:])))\n",
    "# y_train = rcv1.target[:training_samples]\n",
    "# y_test = rcv1.target[training_samples:]\n",
    "\n",
    "X_train.shape\n",
    "\n",
    "# X_train.shape, X_train_rshp.shape,  X_test.shape, X_test_rshp.shape, y_train.shape, y_test.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "ename": "Exception",
     "evalue": "Error when checking model input: expected input_2 to have shape (None, 23149) but got array with shape (23149, 47236)",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mException\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-7-83af873d7e21>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[0;32m     20\u001b[0m                 \u001b[0mbatch_size\u001b[0m\u001b[1;33m=\u001b[0m\u001b[1;36m256\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     21\u001b[0m                 \u001b[0mshuffle\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 22\u001b[1;33m                 validation_data=(X_test, X_test))\n\u001b[0m",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36mfit\u001b[1;34m(self, x, y, batch_size, nb_epoch, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight)\u001b[0m\n\u001b[0;32m   1033\u001b[0m                                                            \u001b[0mclass_weight\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mclass_weight\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1034\u001b[0m                                                            \u001b[0mcheck_batch_dim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1035\u001b[1;33m                                                            batch_size=batch_size)\n\u001b[0m\u001b[0;32m   1036\u001b[0m         \u001b[1;31m# prepare validation data\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1037\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mvalidation_data\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36m_standardize_user_data\u001b[1;34m(self, x, y, sample_weight, class_weight, check_batch_dim, batch_size)\u001b[0m\n\u001b[0;32m    960\u001b[0m                                    \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0minternal_input_shapes\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    961\u001b[0m                                    \u001b[0mcheck_batch_dim\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mFalse\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 962\u001b[1;33m                                    exception_prefix='model input')\n\u001b[0m\u001b[0;32m    963\u001b[0m         y = standardize_input_data(y, self.output_names,\n\u001b[0;32m    964\u001b[0m                                    \u001b[0moutput_shapes\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m/home/felipe/venv2-global/local/lib/python2.7/site-packages/keras/engine/training.pyc\u001b[0m in \u001b[0;36mstandardize_input_data\u001b[1;34m(data, names, shapes, check_batch_dim, exception_prefix)\u001b[0m\n\u001b[0;32m    106\u001b[0m                                         \u001b[1;34m' to have shape '\u001b[0m \u001b[1;33m+\u001b[0m \u001b[0mstr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mshapes\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    107\u001b[0m                                         \u001b[1;34m' but got array with shape '\u001b[0m \u001b[1;33m+\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 108\u001b[1;33m                                         str(array.shape))\n\u001b[0m\u001b[0;32m    109\u001b[0m     \u001b[1;32mreturn\u001b[0m \u001b[0marrays\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    110\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mException\u001b[0m: Error when checking model input: expected input_2 to have shape (None, 23149) but got array with shape (23149, 47236)"
     ]
    }
   ],
   "source": [
    "encoding_dim = 32\n",
    "\n",
    "input_doc = Input(shape=(23149,))\n",
    "\n",
    "encoded = Dense(encoding_dim,activation='relu')(input_doc)\n",
    "decoded = Dense(23149, activation='sigmoid')(encoded)\n",
    "\n",
    "autoencoder = Model(input=input_doc, output=decoded)\n",
    "\n",
    "encoder = Model(input=input_doc, output=encoded)\n",
    "\n",
    "encoded_input = Input(shape=(encoding_dim,))\n",
    "decoder_layer = autoencoder.layers[-1]\n",
    "decoder = Model(input=encoded_input, output=decoder_layer(encoded_input))\n",
    "\n",
    "autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')\n",
    "\n",
    "autoencoder.fit(X_train, X_train,\n",
    "                verbose=2,\n",
    "                nb_epoch=50,\n",
    "                batch_size=256,\n",
    "                shuffle=True,\n",
    "                validation_data=(X_test, X_test))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "clf = OneVsRestClassifier(svm.LinearSVC(penalty='l1',tol=0.01,multi_class='crammer_singer',dual=False))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/felipe/venv2/local/lib/python2.7/site-packages/sklearn/multiclass.py:70: UserWarning: Label not 49 is present in all training examples.\n",
      "  str(classes[c]))\n",
      "/home/felipe/venv2/local/lib/python2.7/site-packages/sklearn/multiclass.py:70: UserWarning: Label not 80 is present in all training examples.\n",
      "  str(classes[c]))\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,\n",
       "     intercept_scaling=1, loss='squared_hinge', max_iter=1000,\n",
       "     multi_class='crammer_singer', penalty='l1', random_state=None,\n",
       "     tol=0.01, verbose=0),\n",
       "          n_jobs=1)"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "clf.fit(X_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "y_pred = clf.predict(X_test)\n",
    "\n",
    "current_score = f1_score(y_test,y_pred,average='micro')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.80843419139591599"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "current_score"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
